{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import glob\n",
    "import sys\n",
    "from time import sleep\n",
    "from collections import Counter\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import RegexpTokenizer\n",
    "import nltk\n",
    "import numpy\n",
    "import scipy\n",
    "import sklearn.datasets\n",
    "import csv\n",
    "\n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk import WordNetLemmatizer\n",
    "from nltk.corpus import stopwords\n",
    "\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.feature_extraction.text import TfidfTransformer\n",
    "\n",
    "import time"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_path=\"train\"\n",
    "test_path=\"test\"\n",
    "def load_train(class_name):\n",
    "    label=0\n",
    "    if class_name is \"pos\":\n",
    "        label=1\n",
    "    data=[]\n",
    "    for file in glob.glob(train_path+\"/\"+class_name+\"/*.txt\"):\n",
    "        f = open(file, \"r\")\n",
    "        data.append([f.read(),label])\n",
    "    return data\n",
    "def load_test():\n",
    "    data=[]\n",
    "    for i in range(0,25000):\n",
    "        file=test_path+\"/\"+str(i)+\".txt\"\n",
    "        f = open(file, \"r\")\n",
    "        data.append([f.read(),i])\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data=load_train(\"pos\")+load_train(\"neg\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_data=load_test()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def cleanUp(text, custom_stopwords=[]):\n",
    "    # Initilaise Lemmatizer object:\n",
    "    lemm = WordNetLemmatizer()\n",
    "    \n",
    "    # Load NLTK stopwords:\n",
    "    my_stopwords = stopwords.words('english') + custom_stopwords\n",
    "    \n",
    "    clean_text = ''\n",
    "    \n",
    "    tokenizer = RegexpTokenizer(r'\\w+')\n",
    "    words = tokenizer.tokenize(text) # word_tokenize() takes care of stripping too.\n",
    "    \n",
    "    for word in words:\n",
    "        w = lemm.lemmatize(word.lower())\n",
    "#         w=word.lower()\n",
    "#         print(w)\n",
    "#         if w not in my_stopwords and len(w)>2:\n",
    "        clean_text += w + \" \"\n",
    "    \n",
    "    return clean_text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def cleanUp_data(dataset):\n",
    "    for i in range(0,len(dataset)):\n",
    "        comment=cleanUp(dataset[i][0])\n",
    "#         comment=dataset[i][0].lower()\n",
    "#         dataset[i][0]=comment.split()\n",
    "        dataset[i][0]=comment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# cleanUp_data(train_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['For a movie that gets no respect there sure are a lot of memorable quotes listed for this gem. Imagine a movie where Joe Piscopo is actually funny! Maureen Stapleton is a scene stealer. The Moroni character is an absolute scream. Watch for Alan \"The Skipper\" Hale jr. as a police Sgt.',\n",
       " 1]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_data[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data Split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_set,valid_set=sklearn.model_selection.train_test_split(train_data,train_size=0.8,test_size=0.2,shuffle=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_x=[comment[0] for comment in train_set]\n",
    "train_y=[comment[1] for comment in train_set]\n",
    "valid_x=[comment[0] for comment in valid_set]\n",
    "valid_y=[comment[1] for comment in valid_set]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature Extraction: Tf-idf & Binary Occurrence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "tfidf_vectorizer=CountVectorizer(ngram_range=(1,2))\n",
    "binary_vectorizer=CountVectorizer(ngram_range=(1,2),binary=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Logistic Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "def LogReg(x_train,y_train,x_valid,y_valid,vectorizer,mode):\n",
    "    X=vectorizer.fit_transform(x_train)\n",
    "    if mode is 'tfidf':\n",
    "        tfidf_transformer = TfidfTransformer(use_idf=False)\n",
    "        X=tfidf_transformer.fit_transform(X)\n",
    "    X_valid=vectorizer.transform(x_valid)\n",
    "    lg=LogisticRegression(C=25.0)\n",
    "    start=time.time()\n",
    "    lg.fit(X,y_train)\n",
    "    valid_pred=lg.predict(X_valid)\n",
    "    print(\"LR runtime: \", time.time()-start)\n",
    "    train_pred=lg.predict(X)\n",
    "    print(\"LR train: \", sklearn.metrics.accuracy_score(y_train, train_pred))\n",
    "    print (\"LR Accuracy: \", sklearn.metrics.accuracy_score(y_valid, valid_pred))\n",
    "#     print (sklearn.metrics.confusion_matrix(y_valid, valid_pred, labels=[1, 0]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Linear SVM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.svm import LinearSVC\n",
    "def SVM(x_train,y_train,x_valid,y_valid,vectorizer,mode,C=25):\n",
    "    X=vectorizer.fit_transform(x_train)\n",
    "    if mode is 'tfidf':\n",
    "        tfidf_transformer = TfidfTransformer(use_idf=False)\n",
    "        X=tfidf_transformer.fit_transform(X)\n",
    "    X_valid=vectorizer.transform(x_valid)\n",
    "    clf=LinearSVC(penalty='l2',dual=True, random_state=0, tol=1e-8, C=C) # dual optimization: n_samples<n_features\n",
    "    start=time.time()\n",
    "    clf.fit(X,y_train)\n",
    "    y_pred=clf.predict(X_valid)\n",
    "    print(\"SVM runtime: \", time.time()-start)\n",
    "    train_pred=clf.predict(X)\n",
    "    print(\"SVM train: \", sklearn.metrics.accuracy_score(y_train, train_pred))\n",
    "    print (\"SVM Accuracy: \", sklearn.metrics.accuracy_score(y_valid, y_pred))\n",
    "#     print (sklearn.metrics.confusion_matrix(y_test, y_pred, labels=[1, 0]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Test for TF-IDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tfidf: \n",
      "\n",
      "LR runtime:  18.403308868408203\n",
      "LR train:  0.997\n",
      "LR Accuracy:  0.8938\n",
      "\n",
      "\n",
      "SVM runtime:  33.03994083404541\n",
      "SVM train:  1.0\n",
      "SVM Accuracy:  0.8952\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(\"tfidf: \\n\")\n",
    "LogReg(train_x,train_y,valid_x,valid_y,tfidf_vectorizer,'tfidf')\n",
    "print(\"\\n\")\n",
    "SVM(train_x,train_y,valid_x,valid_y,tfidf_vectorizer,'tfidf')\n",
    "print(\"\\n\")\n",
    "# naiveBys(train_x,train_y,valid_x,valid_y,tfidf_vectorizer,'tfidf')\n",
    "# print(\"\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Test for Binary Occurrence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "binary occurrence: \n",
      "\n",
      "LR runtime:  18.782140970230103\n",
      "LR train:  1.0\n",
      "LR Accuracy:  0.897\n",
      "\n",
      "\n",
      "SVM runtime:  30.687812089920044\n",
      "SVM train:  1.0\n",
      "SVM Accuracy:  0.892\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(\"binary occurrence: \\n\")\n",
    "LogReg(train_x,train_y,valid_x,valid_y,binary_vectorizer,'binary')\n",
    "print(\"\\n\")\n",
    "SVM(train_x,train_y,valid_x,valid_y,binary_vectorizer,'binary')\n",
    "print(\"\\n\")\n",
    "# naiveBys(train_x,train_y,valid_x,valid_y,binary_vectorizer,'binary')\n",
    "# print(\"\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Predict Test Set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "cleanUp_data(train_data)\n",
    "cleanUp_data(test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "total_train_x=[comment[0] for comment in train_data]\n",
    "total_train_y=[comment[1] for comment in train_data]\n",
    "test_x=[comment[0] for comment in test_data]\n",
    "test_id=[comment[1] for comment in test_data]\n",
    "vectorizer = CountVectorizer(ngram_range=(1, 2))\n",
    "tfidf_transformer = TfidfTransformer(use_idf=False)\n",
    "train_X=vectorizer.fit_transform(total_train_x)\n",
    "train_X=tfidf_transformer.fit_transform(train_X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.svm import LinearSVC\n",
    "clf=LinearSVC(penalty='l2',dual=True, random_state=0, tol=1e-8, C=100)\n",
    "clf.fit(train_X,total_train_y)\n",
    "test_X=vectorizer.transform(test_x)\n",
    "y_pred=clf.predict(test_X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "df=pd.DataFrame(data={'Id':test_id,'Category':y_pred},columns=['Id','Category'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv('svm_1-2gram_c100.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Experiment for C and n-gram Values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "C= 0.001\n",
      "unigram: \n",
      "SVM runtime:  0.21471309661865234\n",
      "SVM train:  0.70145\n",
      "SVM Accuracy:  0.6998\n",
      "bigram: \n",
      "SVM runtime:  0.7077751159667969\n",
      "SVM train:  0.7056\n",
      "SVM Accuracy:  0.6916\n",
      "trigram: \n",
      "SVM runtime:  0.6712040901184082\n",
      "SVM train:  0.8579\n",
      "SVM Accuracy:  0.7332\n",
      "unigram+bigram\n",
      "SVM runtime:  0.8202559947967529\n",
      "SVM train:  0.70405\n",
      "SVM Accuracy:  0.7016\n",
      "unigram+bigram+trigram\n",
      "SVM runtime:  2.0368740558624268\n",
      "SVM train:  0.7016\n",
      "SVM Accuracy:  0.6946\n",
      "C= 0.01\n",
      "unigram: \n",
      "SVM runtime:  0.2434697151184082\n",
      "SVM train:  0.7884\n",
      "SVM Accuracy:  0.7742\n",
      "bigram: \n",
      "SVM runtime:  0.8285892009735107\n",
      "SVM train:  0.8043\n",
      "SVM Accuracy:  0.7778\n",
      "trigram: \n",
      "SVM runtime:  1.0861780643463135\n",
      "SVM train:  0.8907\n",
      "SVM Accuracy:  0.749\n",
      "unigram+bigram\n",
      "SVM runtime:  1.010408878326416\n",
      "SVM train:  0.7966\n",
      "SVM Accuracy:  0.7856\n",
      "unigram+bigram+trigram\n",
      "SVM runtime:  2.660511016845703\n",
      "SVM train:  0.7922\n",
      "SVM Accuracy:  0.78\n",
      "C= 0.1\n",
      "unigram: \n",
      "SVM runtime:  0.30602526664733887\n",
      "SVM train:  0.87765\n",
      "SVM Accuracy:  0.8598\n",
      "bigram: \n",
      "SVM runtime:  0.8047351837158203\n",
      "SVM train:  0.9339\n",
      "SVM Accuracy:  0.855\n",
      "trigram: \n",
      "SVM runtime:  1.514115810394287\n",
      "SVM train:  0.97775\n",
      "SVM Accuracy:  0.801\n",
      "unigram+bigram\n",
      "SVM runtime:  1.2684941291809082\n",
      "SVM train:  0.89955\n",
      "SVM Accuracy:  0.8644\n",
      "unigram+bigram+trigram\n",
      "SVM runtime:  2.6465680599212646\n",
      "SVM train:  0.9091\n",
      "SVM Accuracy:  0.8614\n",
      "C= 1.0\n",
      "unigram: \n",
      "SVM runtime:  1.0808119773864746\n",
      "SVM train:  0.94105\n",
      "SVM Accuracy:  0.8882\n",
      "bigram: \n",
      "SVM runtime:  2.5645129680633545\n",
      "SVM train:  0.9997\n",
      "SVM Accuracy:  0.8788\n",
      "trigram: \n",
      "SVM runtime:  3.772345781326294\n",
      "SVM train:  1.0\n",
      "SVM Accuracy:  0.8416\n",
      "unigram+bigram\n",
      "SVM runtime:  3.3782432079315186\n",
      "SVM train:  0.9864\n",
      "SVM Accuracy:  0.8926\n",
      "unigram+bigram+trigram\n",
      "SVM runtime:  6.382461309432983\n",
      "SVM train:  0.9962\n",
      "SVM Accuracy:  0.89\n",
      "C= 10.0\n",
      "unigram: \n",
      "SVM runtime:  4.5438947677612305\n",
      "SVM train:  0.9924\n",
      "SVM Accuracy:  0.8906\n",
      "bigram: \n",
      "SVM runtime:  15.108246088027954\n",
      "SVM train:  1.0\n",
      "SVM Accuracy:  0.886\n",
      "trigram: \n",
      "SVM runtime:  28.160925149917603\n",
      "SVM train:  1.0\n",
      "SVM Accuracy:  0.847\n",
      "unigram+bigram\n",
      "SVM runtime:  19.599112033843994\n",
      "SVM train:  1.0\n",
      "SVM Accuracy:  0.8954\n",
      "unigram+bigram+trigram\n",
      "SVM runtime:  51.671658992767334\n",
      "SVM train:  1.0\n",
      "SVM Accuracy:  0.8928\n",
      "C= 100.0\n",
      "unigram: \n",
      "SVM runtime:  5.210240125656128\n",
      "SVM train:  1.0\n",
      "SVM Accuracy:  0.8766\n",
      "bigram: \n",
      "SVM runtime:  24.26181983947754\n",
      "SVM train:  1.0\n",
      "SVM Accuracy:  0.8856\n",
      "trigram: \n",
      "SVM runtime:  50.6845269203186\n",
      "SVM train:  1.0\n",
      "SVM Accuracy:  0.8474\n",
      "unigram+bigram\n",
      "SVM runtime:  36.91319394111633\n",
      "SVM train:  1.0\n",
      "SVM Accuracy:  0.8954\n",
      "unigram+bigram+trigram\n",
      "SVM runtime:  63.83374786376953\n",
      "SVM train:  1.0\n",
      "SVM Accuracy:  0.893\n",
      "C= 1000.0\n",
      "unigram: \n",
      "SVM runtime:  5.003766059875488\n",
      "SVM train:  1.0\n",
      "SVM Accuracy:  0.8726\n",
      "bigram: \n",
      "SVM runtime:  23.958541870117188\n",
      "SVM train:  1.0\n",
      "SVM Accuracy:  0.8854\n",
      "trigram: \n",
      "SVM runtime:  62.27902102470398\n",
      "SVM train:  1.0\n",
      "SVM Accuracy:  0.8476\n",
      "unigram+bigram\n",
      "SVM runtime:  31.42343783378601\n",
      "SVM train:  1.0\n",
      "SVM Accuracy:  0.8956\n",
      "unigram+bigram+trigram\n",
      "SVM runtime:  69.24173903465271\n",
      "SVM train:  1.0\n",
      "SVM Accuracy:  0.8932\n"
     ]
    }
   ],
   "source": [
    "i=1e-3\n",
    "vectorizer_1=CountVectorizer(ngram_range=(1,1))\n",
    "vectorizer_2=CountVectorizer(ngram_range=(2,2))\n",
    "vectorizer_3=CountVectorizer(ngram_range=(3,3))\n",
    "vectorizer_4=CountVectorizer(ngram_range=(1,2))\n",
    "vectorizer_5=CountVectorizer(ngram_range=(1,3))\n",
    "while i<=1e3:\n",
    "    print(\"C=\",i)\n",
    "    print(\"unigram: \")\n",
    "    SVM(train_x,train_y,valid_x,valid_y,vectorizer_1,'tfidf',i)\n",
    "    print(\"bigram: \")\n",
    "    SVM(train_x,train_y,valid_x,valid_y,vectorizer_2,'tfidf',i)\n",
    "    print(\"trigram: \")\n",
    "    SVM(train_x,train_y,valid_x,valid_y,vectorizer_3,'tfidf',i)\n",
    "    print(\"unigram+bigram\")\n",
    "    SVM(train_x,train_y,valid_x,valid_y,vectorizer_4,'tfidf',i)\n",
    "    print(\"unigram+bigram+trigram\")\n",
    "    SVM(train_x,train_y,valid_x,valid_y,vectorizer_5,'tfidf',i)\n",
    "    i=i*10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
